Data Exploration
# load data
data <- rbind(read.csv("data/2019_09.csv"),
read.csv("data/2019_10.csv"))
# load station data
stations <- read.csv("data/stops.csv") %>%
filter(stop_id %in% unique(data$to_id)) %>%
st_as_sf(coords = c("stop_lon", "stop_lat")) %>%
st_set_crs("EPSG:4326") %>%
st_transform(nj_crs) %>%
dplyr::select(-stop_code, -stop_desc, -zone_id) %>%
mutate(cvID = sample(round(nrow(.) / 1.15), # random CV ID (out of 100)
size=nrow(.), replace = TRUE))
# get counties geometry of states where NJ rail operates
stateCounties <- rbind(tigris::counties(state = 34),
tigris::counties(state = 36),
tigris::counties(state = 42)
) %>%
dplyr::select(GEOID, NAME, geometry) %>%
st_transform(st_crs(nj_crs))
# spatial join GEOIDs of counties to stations in them
railCounties <- stations %>%
st_join(stateCounties)%>%
dplyr::select(-stop_name, -NAME, -cvID)
# get geometry back for those counties
counties <- railCounties %>%
distinct(GEOID) %>%
left_join(stateCounties) %>%
st_sf()
# read in rail line map
njLines <- st_read("https://opendata.arcgis.com/datasets/e6701817be974795aecc7f7a8cc42f79_0.geojson") %>%
st_transform(nj_crs)
# data wrangling
# process trains data
vars <- c("date", # take out
"train_id", # KEEP to infer direction
"stop_sequence", # take out (deficient/uneven data upon inspection)
"from", # redundant with from_id, take out
"from_id", # KEEP
"to", # redundant with to_id, take out
"to_id", # KEEP
"scheduled_time", # take out after using
"actual_time", # take out after using
"delay_minutes", # DEPENDENT VARIABLE <- convert
"status", # Get rid of cancelled trains
"line", # KEEP
"type") # take out
njData <- data %>%
filter(type == "NJ Transit") %>% # exclude Amtrak trains
filter(status != 'cancelled') %>% # exclude cancelled trains
mutate(time = ymd_hms(scheduled_time), # get time units from scheduled_time
week = week(time),
dotw = wday(time, label = T),
interval60 = floor_date(time, unit='hour'),
interval30 = floor_date(time, unit='30 mins'),
interval15 = floor_date(time, unit='15 mins')) %>%
rename('delay' = delay_minutes) %>%
left_join(stations, by=c('to_id'='stop_id')) %>% # join station data
mutate(station = paste(to_id, line, sep="_")) %>% # create line-station combinations
left_join(st_drop_geometry(railCounties), by=c('to_id'='stop_id')) %>% # join counties (for LOGO-CV 1)
dplyr::select(-train_id,
-date, # take out unused variables
-stop_sequence,
-from,
-to,
-scheduled_time,
-actual_time,
-type,
-time,
-stop_name) %>%
filter(week %in% c(39:43)) %>% # select 5 week period
na.omit(delay) # drop trains with missing values
# set lines to analyze
lines <- c("Northeast Corrdr",
"Atl. City Line",
"Princeton Shuttle",
"No Jersey Coast",
"Morristown Line",
"Pascack Valley",
"Raritan Valley",
"Main Line",
"Montclair-Boonton",
"Bergen Co. Line ",
"Gladstone Branch",
"Meadowlands Rail")
# get weather data
weatherData <-
riem_measures(station = "EWR", date_start = "2019-09-24", date_end = "2019-10-29")
# convert into panel
weatherPanel <-
weatherData %>%
mutate_if(is.character, list(~replace(as.character(.), is.na(.), "0"))) %>% # convert string NAs to NAs and then to 0s
replace(is.na(.), 0) %>% # convert NAs to 0s
mutate(interval60 = ymd_h(substr(valid, 1, 13))) %>% # round to hour intervals
mutate(interval30 = interval60) %>% # assume no changes withing half hours
mutate(week = week(interval60), # get week
dotw = wday(interval60, label=TRUE)) %>% # get day of the week
group_by(interval60) %>% # group by hour
summarize(Temperature = max(tmpf), # summarize temperature, precipitation and windspeed
Precipitation = sum(p01i),
WindSpeed = max(sknt)) %>%
mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))
# create charts by weather indicator
grid.arrange(top = "Weather Data: New Jersey, September & October 2019",
ggplot(weatherPanel, aes(interval60, Precipitation)) +
geom_line() +
labs(title="Precipitation", x="Hour", y="Precipitation") +
plotTheme() +
theme(panel.border = element_blank(),
panel.background = element_rect(fill = "#eeeeee")),
ggplot(weatherPanel, aes(interval60, WindSpeed)) +
geom_line() +
labs(title="Wind Speed", x="Hour", y="Wind Speed") +
plotTheme() +
theme(panel.border = element_blank(),
panel.background = element_rect(fill = "#eeeeee")),
ggplot(weatherPanel, aes(interval60, Temperature)) +
geom_line() +
labs(title="Temperature", x="Hour", y="Temperature (ºF)") +
plotTheme() +
theme(panel.border = element_blank(),
panel.background = element_rect(fill = "#eeeeee")))

# --- map delays by station ---
ggmap_center <- c(lon = -74.4, lat = 40.4)
g <- ggmap(get_googlemap(center = ggmap_center, maptype = "terrain", color = "bw", zoom = 9, scale = 4))
# g2 <- ggmap(get_googlemap(center = system_centroid, maptype = "terrain", color = "bw", zoom = 9, scale = 4))
g + geom_sf(data = rail_lines, inherit.aes = FALSE)
test <- stations
m +
geom_sf(data = rail_lines, inherit.aes = FALSE)
#
g +
#geom_sf(data = rail_lines, inherit.aes = FALSE, color = "gray10") +
geom_sf(data = filter(rail_lines, LINE_CODE == "NC"),
color = theme_red, size = 2, inherit.aes = FALSE) +
geom_sf(data = filter(rail_lines, LINE_CODE == "NE"),
color = theme_orange, size = 2, inherit.aes = FALSE) +
labs(x = "", y = "") +
mapTheme
g +
geom_sf(data = filter(rail_lines, LINE_CODE %in% c("NC", "NE")),
color = "gray40", size = 1, inherit.aes = FALSE, show.legend = FALSE) +
geom_sf(data = delays_by_station,
aes(size = mean_delay_minutes,
color = mean_delay_minutes),
inherit.aes = FALSE, show.legend = FALSE) +
scale_color_stepsn(n.breaks = 4, colors = theme_scale) +
scale_size_continuous(range = c(0.5, 8)) +
labs(x = "", y = "") +
mapTheme
g +
geom_sf(data = filter(rail_lines, LINE_CODE %in% c("NC", "NE")),
color = "gray40", size = 1, inherit.aes = FALSE, show.legend = FALSE) +
geom_sf(data = filter(data, train_id %in% c("3722", "7224")) %>% arrange(delay_minutes),
aes(size = delay_minutes,
color = delay_minutes),
inherit.aes = FALSE, show.legend = FALSE) +
scale_color_stepsn(n.breaks = 5, colors = theme_scale) +
scale_size_continuous(range = c(1, 10)) +
labs(x = "", y = "") +
mapTheme
+
scale_size_continuous() +
scale_color_stepsn(n.breaks = 3, colors = c("green", "orange", "red")) +
mapTheme
map_bbox <- st_as_sfc(st_bbox(st_union(rail_lines))) %>% st_sf() %>% st_transform("EPSG:4326")
map_centroid <- st_coordinates(st_centroid(map_bbox))
system_bbox <- st_as_sfc(st_bbox(st_union(rail_lines))) %>% st_sf() %>% st_transform("EPSG:4326")
system_centroid <- st_coordinates(st_centroid(system_bbox))
ggplot() +
geom_sf(data = map_bbox) +
geom_sf(data = rail_lines) +
#geom_sf(data = stations) +
geom_sf(data = st_centroid(st_union(rail_lines)), color = "red") +
geom_sf(data = st_centroid(st_convex_hull(st_union(rail_lines))), color = "blue") +
geom_sf(data = st_centroid(map_bbox), color = "green")
data %>%
ggplot(aes(scheduled_hour, delay_minutes, colour = line)) + geom_line() +
# geom_vline(data = mondays, aes(xintercept = monday)) +
labs(title="Rideshare trips by week: November-December",
subtitle="Dotted lines for Thanksgiving & Christmas",
x="Day", y="Trip Count") +
plotTheme() + theme(panel.grid.major = element_blank())
# sum delays per station per line per hour (don't have good info to assume direction)
d_station <- njData %>%
group_by(station, interval60) %>% # INCREMENT to half hours?
summarize(delayAggr = sum(delay))
# create empty panel with all possible time/space combinations
# 222 line-station combinations by 840 time intervals = 186480
basePanel <-
expand.grid(interval60 = unique(njData$interval60), # INCREMENT to half hours?
station = unique(njData$station))
# join trips information into panel by hour
tripsPanel <-
d_station %>%
right_join(basePanel) %>%
left_join(weatherPanel, by = "interval60") %>% # INCREMENT to half hours?
mutate(week = week(interval60),
dotw = wday(interval60, label = TRUE))
# create lag variables
delaysPanel <-
tripsPanel %>%
arrange(station, interval60) %>%
replace(is.na(.), 0) %>%
group_by(station) %>%
mutate(lagHour = dplyr::lag(delayAggr, 1),
lag2Hours = dplyr::lag(delayAggr, 2),
lag3Hours = dplyr::lag(delayAggr, 3),
lag4Hours = dplyr::lag(delayAggr, 4),
lag12Hours = dplyr::lag(delayAggr, 12),
lag1day = dplyr::lag(delayAggr, 24)) %>%
ungroup()
# version of delaysPanel with counties and point geometry for stations
delaysPanelSpatial <- delaysPanel %>%
mutate(id = as.integer(str_extract(station, '[0-9]*'))) %>%
left_join(railCounties, by=c("id"="stop_id")) %>%
st_sf()
# Partition the resulting data in two sets, training on 3 weeks and testing on the following 2
delaysTrain <- filter(delaysPanelSpatial, week <= 41)
delaysTest <- filter(delaysPanelSpatial, week > 41)
Exploratory Analysis
# TO PRODUCE CHARTS:
# just a table for the network delays
# a bar chart for lines
# a map and bar chart for stations (overall).
# Delays by Network, Lines and Station aggregates
selLines <- lines #c("No Jersey Coast", "Northeast Corrdr")
# the entire network
delaysNetwork <- njData %>%
summarize(totalDelay = sum(delay),
meanDelay = mean(delay))
# by Line
delaysLine <- njData %>%
group_by(line) %>%
summarize(totalDelay = sum(delay),
meanDelay = mean(delay)) %>%
arrange(desc(meanDelay))
# by Station
delaysStation <- njData %>%
filter(line %in% selLines) %>%
group_by(to_id) %>%
summarize(totalDelay = sum(delay),
meanDelay = mean(delay)) %>%
arrange(desc(meanDelay)) %>%
left_join(stations, by=c('to_id'='stop_id')) # add name and geometry to plot
# set beginning of 'week' according to january 1st
tuesdays <-
mutate(delaysPanel,
day = ifelse(dotw == "Tue" & hour(interval60) == 1,
interval60, 0)) %>%
filter(day != 0)
rbind(
mutate(delaysTrain, legend = "Training"),
mutate(delaysTest, legend = "Testing")) %>%
group_by(legend, interval60) %>%
summarize(delays = sum(delayAggr)) %>%
ungroup() %>%
ggplot(aes(interval60, delays, colour = legend)) +
geom_line() +
scale_colour_manual(values = palette2) +
geom_vline(data = tuesdays, aes(xintercept = day)) +
labs(title="Citi bike trips in Brooklyn by week",
subtitle = "5-week period in September-October 2019",
x="",
y="Trip Count") +
plotTheme() +
theme(legend.position = "bottom",
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_rect(fill = "#eeeeee")
)

# set the location of stations for labeling
PennSt <- c(-73.992358, 40.750046)
Ph30St <- c(-75.182327, 39.956565)
# get station points and aggregate delay
delaysPoints <- delaysPanelSpatial %>%
group_by(week, station) %>%
summarize(aggrDelays = sum(delayAggr)) %>%
ungroup()
njCounties <- stateCounties %>%
filter(GEOID < 35000)
otherCounties <- stateCounties %>%
filter(GEOID %in% c(36061,36071,36087,42017,42091,42101))
# side by side graduate symbol maps
delaysPoints %>%
ggplot() +
geom_sf(data=njCounties, colour = "#222222", fill = "#2f2f2f") + # BASE MAP goes here
geom_sf(data=njLines, colour = "#888888", fill = NA, size=0.66) + # BASE MAP goes here
geom_sf(pch = 21,
colour = 'NA',
alpha = 0.66,
aes(size = aggrDelays,
fill = aggrDelays)) +
facet_wrap(~week, ncol = 5) +
scale_fill_viridis_c(option = "plasma") +
scale_size_continuous(
range = c(0,4)) +
labs(title="NJ Rail: Amount of delay per station",
subtitle = "September-October 2019") +
guides(size = F,
fill=guide_colorbar(title="aggregate delay per station", barwidth = 20)) +
mapTheme() +
theme(legend.position = "bottom",
panel.border = element_blank(),
panel.background = element_rect(fill = "#222222"),
panel.grid = element_blank(),
strip.background = element_rect(fill = "#222222"),
strip.text.x = element_text(size = 12, color = '#eeeeee', hjust=0.01))

# filter bike data for just september 24 2019
week39 <- njData %>%
filter(week == 39 & dotw == "Tue")
# create empty panel with all station-time combinations
week39Panel <-
expand.grid(
interval15 = unique(week39$interval15),
station = as.character(unique(njData$station)))
# alternate mode of counting trips
week39Trips <- njData %>%
filter(week == 39) %>%
group_by(station, interval15) %>%
summarize(aggrDelay = sum(delay, na.rm=T))
# put data together for sept. 24
njAnimationData <-
week39Trips %>%
right_join(week39Panel) %>%
mutate(id = as.integer(str_extract(station, '[0-9]*'))) %>%
left_join(stations, by=c("id" = "stop_id")) %>%
st_sf()
# create map per 15 minute interval
animation <-
njAnimationData %>%
ggplot() +
geom_sf(data=njCounties, colour = "#222222", fill = "#2f2f2f") +
geom_sf(data=njLines, colour = "#666666", fill = NA, size=0.5) + # BASE MAP goes here
geom_sf(pch = 21,
colour = 'NA',
alpha = 0.8,
aes(size = aggrDelay,
fill = aggrDelay)) +
scale_fill_viridis_c(option = "plasma") +
scale_size_continuous(
range = c(0,7)) +
labs(title="NJ Rail: Amount of delay per station",
subtitle = "15 minute intervals: {current_frame}") +
guides(size = F,
fill=guide_colorbar(title="trips per station", barwidth = 10)) +
transition_manual(interval15) +
mapTheme() +
theme(legend.position = "bottom",
panel.border = element_blank(),
panel.background = element_rect(fill = "#222222"),
panel.grid = element_blank(),
strip.background = element_rect(fill = "#222222")
)
# plot animation
animate(animation, duration=20, renderer = gifski_renderer())

# temperature as a function of delays by week
delaysPanel %>%
group_by(interval60) %>%
summarize(meanDelay = mean(delayAggr),
temperature = first(Temperature)) %>%
mutate(week = week(interval60)) %>%
ggplot(aes(temperature, meanDelay)) +
geom_point(aes(color=temperature)) +
scale_color_gradient(low="#1b98e0", high="red") +
geom_smooth(method = "lm", se= FALSE, color='#ffffff') +
facet_wrap(~week, ncol=5) +
labs(title="NJ Rail delays as a fuction of temperature by week",
subtitle='September-October 2019',
x="Temperature", y="Mean Trip Count") +
plotTheme() +
theme(panel.border = element_blank(),
panel.background = element_rect(fill = "#222222"),
panel.grid = element_blank(),
panel.grid.major = element_blank(),
strip.background = element_rect(fill = "#222222"),
strip.text.x = element_text(size = 12, color = '#ffffff', hjust=0.01)
)

delaysPanel %>%
group_by(interval60) %>%
summarize(meanDelay = mean(delayAggr),
Precipitation = first(Precipitation)) %>%
mutate(isPrecip = ifelse(Precipitation > 0,"Rain/Snow", "None")) %>%
group_by(isPrecip) %>%
ggplot(aes(isPrecip, meanDelay, fill=isPrecip)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("None" = "#222222",
"Rain/Snow" = "#1b98e0")) +
labs(title='Variation of delay by precipitation',
x="Precipitation", y="Mean Delay Time (min)") +
plotTheme() +
theme(legend.position = "none",
panel.border = element_blank(),
panel.background = element_rect(fill = "#ffffff"),
panel.grid.major.x = element_blank(),
strip.text.x = element_text(size = 12)
)

Modeling
# Model A - just time (hour), day of the week and weather
reg1 <- lm(delayAggr ~
hour(interval60) +
dotw +
Temperature,
data = delaysTrain)
# Model B - just space (station), day of the week and weather
reg2 <- lm(delayAggr ~
station +
GEOID +
dotw + Temperature,
data = delaysTrain)
# Model C - time and space
reg3 <- lm(delayAggr ~
station +
GEOID +
hour(interval60) +
dotw +
Temperature,
data = delaysTrain)
# Model D - Lag variables
reg4 <- lm(delayAggr ~
station +
GEOID +
hour(interval60) +
dotw +
Temperature +
lagHour +
lag2Hours +
lag3Hours +
lag12Hours +
lag1day,
data = delaysTrain)
delaysTest_weekNest <-
as.data.frame(delaysTest) %>%
nest(-week)
# define function to return predictions based on a dataset of nested tibbles and a regression model
modelPred <- function(dat, fit){
pred <- predict(fit, newdata = dat)}
# return predictions into a tibble of tibbles
weekPredictions <-
delaysTest_weekNest %>%
mutate(A_Time_FE = map(.x = data, fit = reg1, .f = modelPred),
B_Space_FE = map(.x = data, fit = reg2, .f = modelPred),
C_Space_Time_FE = map(.x = data, fit = reg3, .f = modelPred),
D_Space_Time_Lags = map(.x = data, fit = reg4, .f = modelPred))
weekPredictions <- weekPredictions %>%
gather(Regression, Prediction, -data, -week) %>% # turn into long form by week
mutate(Observed = map(data, pull, delayAggr),
absoluteError = map2(Observed, Prediction, ~abs(.x - .y)), # apply absolute error function
MAE = map_dbl(absoluteError, mean), # get mean of absolute error
sd_AE = map_dbl(absoluteError, sd)) # get SD of absolute error
# chart Mean Absolute Errors by model specifications and Week
weekPredictions %>%
dplyr::select(week, Regression, MAE) %>%
gather(Variable, MAE, -Regression, -week) %>%
ggplot(aes(week, MAE)) +
geom_bar(aes(fill = Regression), alpha=.9, position = "dodge", stat="identity") +
scale_x_continuous(breaks = c(42,43)) +
scale_fill_manual(values = palette5) +
labs(title = "Mean Absolute Errors",
subtitle = 'by model specification and week') +
plotTheme() +
theme(legend.position = "bottom",
panel.border = element_blank(),
panel.background = element_rect(fill = "#222222"),
panel.grid = element_blank(),
panel.grid.major.x = element_blank(),
strip.text.x = element_text(size = 12)
)

# select best regression model and get value by station (or tract)
errors <- weekPredictions %>%
filter(Regression == "D_Space_Time_Lags") %>%
unnest %>%
st_sf()
# get total MAE per weeks 42 and 43
errorWeek <- errors %>%
dplyr::select(station, absoluteError, week, geometry) %>%
gather(Variable, Value, -station, -week, -geometry) %>%
group_by(Variable, station, week) %>%
summarize(MAE = mean(Value))
# get MAE per hour on Tuesday October 14
errorDay <- errors %>%
dplyr::select(station,
absoluteError,
geometry,
interval60)%>%
gather(Variable,
Value,
-interval60,
-station,
-geometry) %>%
filter(wday(interval60, label = TRUE) == "Tue" & week(interval60) == 42) %>%
group_by(hour = hour(interval60), station) %>%
summarize(MAE = mean(Value))
# map of error by weeks
errorWeek %>%
ggplot() +
geom_sf(data=njCounties, colour = "#222222", fill = "#3a3a3a") +
geom_sf(data=njLines, colour = "#888888", fill = NA, size=0.66) + # BASE MAP goes here
geom_sf(pch = 21,
colour = 'NA',
alpha = 0.75,
aes(size = MAE,
fill = MAE)) +
geom_text(
label="NY Penn St.",
x=PennSt[1]+.05,
y=PennSt[2]-.05,
size = 3,
color = "#eeeeee"
) +
geom_text(
label="Phl 30th St.",
x=Ph30St[1]-.05,
y=Ph30St[2]-.05,
size = 3,
color = "#eeeeee"
) +
facet_wrap(~week, ncol = 2) +
scale_fill_gradient(low='#91bfdb',
high='#fc8d59',
guide='colorbar') +
scale_size_continuous(range = c(0,6)) +
labs(title="Mean Absolute Error per week and station",
subtitle = "NJ Rail delays by station") +
guides(size=F,
fill=guide_colorbar(title="MAE", barwidth = 20)) +
mapTheme() +
theme(legend.position = "bottom",
panel.border = element_blank(),
panel.background = element_rect(fill = "#222222"),
panel.grid = element_blank(),
strip.background = element_rect(fill = "#222222"),
strip.text.x = element_text(size = 16, color = '#ffffff', hjust=0.01)
)

# make a map of MAES by hour of day
errorDay %>%
ggplot() +
geom_sf(data=njCounties, colour = "#222222", fill = "#3a3a3a") +
geom_sf(data=njLines, colour = "#888888", fill = NA, size=0.66) + # BASE MAP goes here
geom_sf(pch = 21,
colour = 'NA',
alpha = 0.75,
aes(size = MAE,
fill = MAE)) +
facet_wrap(~hour, ncol = 6) +
scale_fill_gradient(low='#91bfdb',
high='#fc8d59',
guide='colorbar') +
scale_size_continuous(range = c(0,4)) +
labs(title="Mean Absosulte Error per hour and station",
subtitle = "NJ Rail Lines - September 24th 2019") +
guides(size=F,
fill=guide_colorbar(title="MAE", barwidth = 20)) +
mapTheme() +
theme(legend.position = "bottom",
panel.border = element_blank(),
panel.background = element_rect(fill = "#222222"),
panel.grid = element_blank(),
strip.background = element_rect(fill = "#222222"),
strip.text.x = element_text(size = 12, color = '#ffffff', hjust=0.05)
)

# TODO: fix cv errors, incl this one:
# Quitting from lines 885-989 (MUSA-508_HW7.Rmd)
# Error: Problem with `filter()` input `..1`.
# ℹ Input `..1` is `dataset[[id]] != thisFold`.
# x Must extract column with a single valid subscript.
# x Subscript `id` has size 186480 but must be size 1.
# CrossValidations: by neighborhood or census tract.
# define cross validation formula
crossValidate <- function(dataset, id, dependentVariable, indVariables, indVariableName) {
allPredictions <- data.frame()
cvID_list <- unique(dataset[[id]])
for (i in cvID_list) {
thisFold <- i
cat("This hold out fold is", thisFold, "\n")
fold.train <- filter(dataset, dataset[[id]] != thisFold) %>% as.data.frame() %>%
dplyr::select(id, geometry, indVariables, dependentVariable)
fold.test <- filter(dataset, dataset[[id]] == thisFold) %>% as.data.frame() %>%
dplyr::select(id, geometry, indVariables, dependentVariable)
regression <-
lm(delayAggr ~ .,
data = fold.train %>%
dplyr::select(-geometry, -id))
thisPrediction <-
mutate(fold.test, Prediction = predict(regression, fold.test, type = "response"))
allPredictions <-
rbind(allPredictions, thisPrediction)
}
return(st_sf(allPredictions))
}
# random LOGO-CV
regVarsRand <- c('GEOID',
'interval60',
'dotw',
'Temperature',
'lagHour',
'lag2Hours',
'lag3Hours',
'lag12Hours',
'lag1day')
# spatial (counties) LOGO CV ## ERROR: "no simple features column present", might need to join back the station points???
regCVrandom <- crossValidate(
dataset = z,
id = "cvID",
dependentVariable = "delayAggr",
indVariables = regVarsRand) %>%
dplyr::select(cvID = cvID, delayAggr, Prediction, geometry)
# Run four regressions by model
# regression with LOGO-CV and no spatial features
regVars <- c('interval60',
'dotw',
'Temperature',
'lagHour',
'lag2Hours',
'lag3Hours',
'lag12Hours',
'lag1day')
# Counties LOGO-CV
regCVcounties <- crossValidate(
dataset = delaysPanelSpatial,
id = "GEOID",
dependentVariable = "delayAggr",
indVariables = regVars) %>%
dplyr::select(cvID = GEOID, delayAggr, Prediction, geometry)
# TODO:
# "Line 885:
# compute errors and MAE by station/hour
regCV1 <- regCVrandom %>%
st_drop_geometry() %>%
mutate(regression = 'spatial CV neighborhoods', # identify regression
interval60 = delaysPanelSpatial$interval60, # join time back
week = week(interval60)) %>%
mutate(station = delaysPanelSpatial$station) %>% # join stations back
rename('Observed' = delayAggr) %>%
mutate(absoluteError = abs(Observed - Prediction)) # get absolute error
# compute errors and MAE by station/hour
regCV2 <- regCVcounties %>%
st_drop_geometry() %>%
mutate(regression = 'spatial CV tracts', # identify regression
interval60 = delaysPanelSpatial$interval60, # join time back
week = week(interval60)) %>%
mutate(station = delaysPanelSpatial$station) %>% # join stations back
rename('Observed' = delayAggr) %>%
mutate(absoluteError = abs(Observed - Prediction)) # get absolute error